×
n
i
i
+
j
j
t
t
−
1
n
f
(
x
)
f
(
x
)
f
x
f
(
x
)
$X \in \mathbb{R}^{r \times c \times 3}$
X
i,j,
0
i
j
X
i,j,
1
X
i,j,
2
$$\sqrt{\frac{1}{3rc}\sum_{i=1}^{r}\sum_{j=1}^{c}\sum_{k=1}^{3}\left(X_{i,j,k}-\bar{X}\right)^{2}}$$
$\bar{X}$
$$\bar{X}=\frac{1}{3rc}\sum_{i=1}^{r}\sum_{j=1}^{c}\sum_{k=1}^{3}X_{i,j,k}.$$
s
λ
$$X'_{i,j,k}=s\,\frac{X_{i,j,k}-\bar{X}}{\max\left\{\epsilon,\ \sqrt{\lambda+\frac{1}{3rc}\sum_{i=1}^{r}\sum_{j=1}^{c}\sum_{k=1}^{3}\left(X_{i,j,k}-\bar{X}\right)^{2}}\right\}}.$$
−2.0
−1.5
−1.0
−0.5
0.0
0.5
1.0
1.5
2.0
x
0
−1.5
−1.0
−0.5
0.0
0.5
1.0
1.5
2.0
x
0
Raw input
λ
=
0
λ
>
0
λ
=
0
10
−
8
= 0
λ
= 10
s
1
L
2
s
X
=
(
x
1
,
x
2
,
.
.
.
x
T
)
y
= (
y
1
,
y
2
,
.
.
.
y
N
)
f
ASR
≈
f
∗
ASR
f
∗
ASR
y
X
$$f^{*}_{\mathrm{ASR}}(X)=\arg\max_{y}\,P^{*}(\mathbf{y}=y\mid\mathbf{X}=X)$$
P
∗
X
y
n
n
n
n
n
n
n
n
n
n
n
P
(
x
t
|
x
1
,
.
.
.
,
x
t
−
1
)
P
(
x
1
,
.
.
.
,
x
T
)
n
n
−
1
x
t
V
n
n
n
n
n
n
n
n
n
p
n
(
x
1
,
.
.
.
,
x
n
)
n
n
n
n
−
1
$$p(x_{t}\mid x_{t-n+1},\ldots,x_{t-1})=p_{n}(x_{t-n+1},\ldots,x_{t})\,/\,p_{n-1}(x_{t-n+1},\ldots,x_{t-1})$$
p
n
p
n
−
1
p
3
(
THE
DOG
RAN
)
p
(
AWAY
|
DOG
RAN
)
$$p(\text{THE DOG RAN AWAY})=p_{3}(\text{THE DOG RAN})\,p_{3}(\text{DOG RAN AWAY})\,/\,p_{2}(\text{DOG RAN}).$$
n
p
n
(
x
t
−
n
+1
,
.
.
.
,
x
t
)
p
n
−
1
p
n
−
1
p
n
−∞
n
n
n
n
x
t
−
1
,
.
.
.
,
x
t
−
n
+1
x
t
x
t
−
n
+
k
,
.
.
.
,
x
t
−
1
k
n
|
V
|
n
n
n
n
n
n
|
V
|
n
n
n
n
√
2
V
|
V
|
|
V
|
|
V
|
h
p
h
p
W
b
$$a_{i}=b_{i}+\sum_{j}W_{ij}h_{j}\quad\forall i\in\{1,\ldots,|V|\},$$
$$p_{i}=\frac{e^{a_{i}}}{\sum_{i'\in\{1,\ldots,|V|\}}e^{a_{i'}}}.$$
h
n
h
O
(
|
V
|
n
h
)
n
h
|
V
|
V
L
$T=V\setminus L$
n
C
P
(
i
∈
T
|
C
)
V
$$P(y=i\mid C)=1_{i\in L}\,P(y=i\mid C,\,i\in L)\,\bigl(1-P(i\in T\mid C)\bigr)+1_{i\in T}\,P(y=i\mid C,\,i\in T)\,P(i\in T\mid C)$$
P
(
y
=
i
|
C,
i
∈
L
)
P
(
y
=
i
|
C,
i
∈
T
)
n
|
V
|
|
V
|
n
h
|
V
|
log
|
V
|
1
0
000
001
010
011
100
101
110
111
11
10
01
00
w
0
w
0
w
1
w
1
w
2
w
2
w
3
w
3
w
4
w
4
w
5
w
5
w
6
w
6
w
7
w
7
|
V
|
|
V
|
O
(log
|
V
|
)
w
0
,
.
.
.
,
w
7
(
w
0
,
w
1
)
(
w
2
,
w
3
)
(
w
4
,
w
5
)
(
w
6
,
w
7
)
y
y
b
i
i
y
10
(
b
0
(
w
4
) =
1
,
b
1
(
w
4
) = 0)
w
4
$$P(y=w_{4})=P(b_{0}=1,\,b_{1}=0,\,b_{2}=0)=P(b_{0}=1)\,P(b_{1}=0\mid b_{0}=1)\,P(b_{2}=0\mid b_{0}=1,\,b_{1}=0).$$
y
$$\log P(y\mid C)=\sum_{i}\log P(b_{i}\mid b_{1},b_{2},\ldots,b_{i-1},C)$$
k
y
k
n
=
b
1
,
b
2
,
.
.
.
,
b
i
−
1
n
y
$$p_{n}=\mathrm{sigmoid}(c_{n}+v_{n}\cdot h_{C})$$
$$\log P(b_{i}\mid b_{1},b_{2},\ldots,b_{i-1},C)=b_{i}\log p_{n}+(1-b_{i})\log(1-p_{n})$$
log
|
V
|
|
V
|
v
n
c
n
h
C
n
h
O
(
n
2
h
)
O
(
n
h
L
)
L
L
n
h
n
h
log
2
(10
6
)
L
L
n
h
L
|
V
|
|
V
|
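$$\begin{aligned}
\frac{\partial\log P(y\mid C)}{\partial\theta}
&=\frac{\partial\log\mathrm{softmax}_{y}(a)}{\partial\theta}\\
&=\frac{\partial}{\partial\theta}\log\frac{e^{a_{y}}}{\sum_{i}e^{a_{i}}}\\
&=\frac{\partial}{\partial\theta}\Bigl(a_{y}-\log\sum_{i}e^{a_{i}}\Bigr)\\
&=\frac{\partial a_{y}}{\partial\theta}-\sum_{i}P(i\mid C)\,\frac{\partial a_{i}}{\partial\theta}
\end{aligned}$$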
a
a
y
a
i
i
P
(
i
|
C
)
P
(
i
|
C
)
i
q
p
i
/q
i
$p_{i}=P(i\mid C)$
a
i
n
i
$$w_{i}=\frac{p_{n_{i}}/q_{n_{i}}}{\sum_{j=1}^{N}p_{n_{j}}/q_{n_{j}}}.$$
N
q
$$\sum_{i=1}^{|V|}P(i\mid C)\,\frac{\partial a_{i}}{\partial\theta}\approx\frac{1}{N}\sum_{i=1}^{N}w_{i}\,\frac{\partial a_{n_{i}}}{\partial\theta}.$$
q
a
y
a
i
$$L=\sum_{i}\max(0,\,1-a_{y}+a_{i}).$$
a
y
a
i
y
a
y
n
n
n
n
n
n
|
sV
|
n
n
n
n
n
Intermediate, semantic representation
Encoder
Decoder
Output object, e.g. English sentence
Source object, e.g. French sentence or image
P
(
1
,
2
,
.
.
.
,
T
)
C
C
1
,
2
,
.
.
.
,
k
1
,
2
,
.
.
.
,
n
P
(
1
,
2
,
.
.
.
,
k
|
1
,
2
,
.
.
.
,
n
)
n
P
(
1
,
2
,
.
.
.
,
T
)
C
C
C
P
(
Y
|
ω
)
Y
=
(
y
1
,
.
.
.
,
y
k
)
ω
P
(
Y
|
X
)
X
ω
X
x
t
h
t
$$h_{t}=f(h_{t-1},x_{t}).$$
x
t
+1
ω
P
(
x
t
+1
=
x
t
+1
|
ω
t
)
.
ω
t
= (
ω
RBM
,
ω
t
)
ω
RBM
ω
t
h
t
ω
t
=
g
(
h
t
)
.
f
g
ω
RBM
ω
t
$$P(x_{1},\ldots,x_{T})=\prod_{t}P(x_{t+1}\mid x_{t},x_{t-1},\ldots,x_{1})=\prod_{t}P(x_{t+1}\mid\omega_{t})=\prod_{t}P(x_{t+1}\mid h_{t}).$$
δ
ω
δ
ω
≈
∇
ω
log
P
(
x
t
+1
=
x
t
+1
|
ω
)
δ
ω
ω
t
ω
RBM
ω
t
δ
ω
δ
ω
RBM
δ
ω
t
δ
ω
RBM
ω
RBM
δ
ω
t
log
P
(
x
t
+1
=
x
t
+1
|
ω
t
)
ω
t
f
g
u
u
ˆ
R
A
B
b
c
$$\hat{R}_{u,i}=b_{u}+c_{i}+\sum_{j}A_{u,j}B_{j,i}.$$
ˆ
R
u,i
R
u,i
R
$R=UDV'$
$A=UD$
$B=V'$
x
a
a
a
a
2
0
{
1
,
2
,
3
}
$S=\{(1,2),(1,3),(2,3)\}$
(1
,
2)
∈
S
$(2,1)\notin S$
$$(\text{subject},\ \text{verb},\ \text{object})$$
$$(\text{entity}_{i},\ \text{relation}_{j},\ \text{entity}_{k}).$$
$$(\text{entity}_{i},\ \text{attribute}_{j}).$$